In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sys
import plotly.express as px
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
sys.tracebacklimit = 0 # turn off the error tracebacks
In [2]:
# Problem 1 - Get user agent and make headers dictionary
r = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(r.text)['user-agent']
headers = {'User-Agent': 'data science passion project (vrd9sd@virginia.edu) (Language=Python 3.8.2; platform=Mac OSX 13.0.1'}

# Url to get HTML from 
url = 'https://waterdata.usgs.gov/va/nwis/current/?type=gw'
r = requests.get(url, headers=headers)

# Parsing HTML code 
mysoup = BeautifulSoup(r.text, 'html.parser')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 3
      1 # Problem 1 - Get user agent and make headers dictionary
      2 r = requests.get('https://httpbin.org/user-agent')
----> 3 useragent = json.loads(r.text)['user-agent']
      4 headers = {'User-Agent': 'data science passion project (vrd9sd@virginia.edu) (Language=Python 3.8.2; platform=Mac OSX 13.0.1'}
      6 # Url to get HTML from 

NameError: name 'json' is not defined
In [ ]:
# make empty list of counties
counties=[]
# populate county list with one instance for each site number under that county
for td in mysoup.find_all('td'):
    if 'colspan' in td.attrs:
        county_name = td.strong.text
        current_county = county_name
    elif td.find('a') and td.find('a').get('href', '').startswith('/va/nwis/uv'):
        counties.append(current_county[1:])
In [ ]:
station_number = [x.string for x in mysoup.find_all(lambda tag: tag.has_attr('href') and tag['href'].startswith('/va/nwis/uv'))]
In [3]:
date_and_site = [x.string for x in mysoup.find_all('td', attrs = {'nowrap':'nowrap'})]
dates = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==1]
site_name = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==0]

water_depth_strings = [x.find_all('td')[3].string for x in mysoup.find_all('tr', attrs = {'align':'right'})]
water_depths = [item[:-1] if item != None else np.nan for item in water_depth_strings]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 date_and_site = [x.string for x in mysoup.find_all('td', attrs = {'nowrap':'nowrap'})]
      2 dates = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==1]
      3 site_name = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==0]

NameError: name 'mysoup' is not defined

Making County names consistent with Fips Database¶

In [4]:
counties =  [x.replace('Of', 'of') for x in counties] # uncapitalize 'Of' in places
counties = [x.replace('And', 'and') for x in counties] # uncapitalize 'And' for places
counties = [x[:-4] + 'city' if x.endswith('City') else x for x in counties] # uncapitalize 'City' in places
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 counties =  [x.replace('Of', 'of') for x in counties] # uncapitalize 'Of' in places
      2 counties = [x.replace('And', 'and') for x in counties] # uncapitalize 'And' for places
      3 counties = [x[:-4] + 'city' if x.endswith('City') else x for x in counties] # uncapitalize 'City' in places

NameError: name 'counties' is not defined
In [5]:
groundwater_data = pd.DataFrame({
    'Jurisdiction': counties,
    'station number': station_number, 
    'dates': dates,
    'site_name': site_name,
    'water table depth': water_depths})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 2
      1 groundwater_data = pd.DataFrame({
----> 2     'Jurisdiction': counties,
      3     'station number': station_number, 
      4     'dates': dates,
      5     'site_name': site_name,
      6     'water table depth': water_depths})

NameError: name 'counties' is not defined

Outputting to CSV File¶

In [34]:
file_path = 'water_table_depths.csv'
if not os.path.exists(file_path):
    groundwater_data.to_csv(file_path, index=False)
else:
    new_data.to_csv(file_path, mode='a', header=False, index=False)

Make the date current eastern time datetime object¶

In [35]:
# from datetime import datetime
# from zoneinfo import ZoneInfo
# edt_timezone = ZoneInfo('US/Eastern')
# datetime_objects = []
# for date in dates:
#     date_string = date.replace(' EDT', '') # strip edt since we set 
#     date_object = datetime.strptime(date_string, "%m/%d %H:%M") # parse into datetime object
#     date_object = date_object.replace(year=datetime.now().year) # Add current year
#     date_object = date_object.astimezone(edt_timezone) # localize to edt timezone
#     datetime_objects.append(date_object) # append to list

# print(datetime_objects[-1])

Merging Fips-County data with groundwater_data¶

In [36]:
data = pd.read_csv('vacounties.csv') # read in vacounties data 
va_fips = data.iloc[:, :2] #isolate county and FIPS columns
In [37]:
# Come back to this to merge correctly (counties named city turns into error)
merged_data = pd.merge(va_fips, groundwater_data, on='Jurisdiction', how='right', indicator=True) # merge groundwater data with counties
In [38]:
merged_data.query("_merge!='both'") # Identify Counties that dont have match in right df Should be empty
Out[38]:
FIPS Jurisdiction station number dates site_name water table depth _merge
In [39]:
merged_data['water table depth'] = merged_data['water table depth'].astype('float')
In [40]:
grouped = merged_data.groupby('Jurisdiction').agg(
    mean_county_depth=('water table depth', 'mean'),
    median_county_depth = ('water table depth', 'median'),
    num_county_stations = ('water table depth', 'size'),
    min_county_depth = ('water table depth', 'min'),
    max_county_depth = ('water table depth', 'max'),
    q25_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
    q75_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
    ).reset_index()
grouped.head()
Out[40]:
Jurisdiction mean_county_depth median_county_depth num_county_stations min_county_depth max_county_depth q25_county_depth q75_county_depth
0 Accomack County 47.443333 41.54 12 9.30 101.16 75.0800 75.0800
1 Appomattox County -0.550000 -0.55 1 -0.55 -0.55 -0.5500 -0.5500
2 Augusta County 15.600000 15.60 1 15.60 15.60 15.6000 15.6000
3 Bath County 5.860000 5.86 1 5.86 5.86 5.8600 5.8600
4 Bedford County 16.942500 12.29 4 6.09 37.10 20.0975 20.0975
In [41]:
merged_with_stats = pd.merge(merged_data, grouped, on='Jurisdiction', how='left')
merged_with_stats.head()
Out[41]:
FIPS Jurisdiction station number dates site_name water table depth _merge mean_county_depth median_county_depth num_county_stations min_county_depth max_county_depth q25_county_depth q75_county_depth
0 51001 Accomack County 374425075400001 10/02 16:00 EDT 65K 27 SOW 114A 50.74 both 47.443333 41.54 12 9.3 101.16 75.08 75.08
1 51001 Accomack County 374425075400002 10/02 16:00 EDT 65K 28 SOW 114B 79.40 both 47.443333 41.54 12 9.3 101.16 75.08 75.08
2 51001 Accomack County 374425075400003 10/02 16:00 EDT 65K 29 SOW 114C 101.16 both 47.443333 41.54 12 9.3 101.16 75.08 75.08
3 51001 Accomack County 375315075332101 10/02 16:30 EDT 66M 57 76.16 both 47.443333 41.54 12 9.3 101.16 75.08 75.08
4 51001 Accomack County 375315075332102 10/02 16:30 EDT 66M 58 69.47 both 47.443333 41.54 12 9.3 101.16 75.08 75.08

Look at distribution of groundwater depths¶

In [82]:
sns.histplot(merged_with_stats['water table depth'], bins=9)
plt.xlabel('Water Table Depth (ft)')
plt.ylabel('Frequency')
plt.title('VA Water Table Depths at USGS monitoring locations')
plt.show()
No description has been provided for this image

Making Chloropleth of water table depths by counties¶

In [42]:
# Get geojson for county fips
import json
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
r = requests.get(url)
geojson_counties = json.loads(r.text)
In [79]:
# Showing Virginia choropleth for FIPS that are in merged groundwater_data
fig = px.choropleth(merged_with_stats,
                    geojson=geojson_counties,
                    color='num_county_stations',
                    scope='usa',
                    locations='FIPS',
                    hover_name = 'Jurisdiction',
                    color_continuous_scale = px.colors.sequential.Greys, # [::-1], # reverses the color scale
                    hover_data = ['num_county_stations', 'mean_county_depth'],
                    title = 'VA Water Table Depth monitoring stations by County Vizualization'
                                  )
fig.update_geos(fitbounds='locations')
fig.show()
In [87]:
merged_with_stats = merged_with_stats[merged_with_stats['num_county_stations'] > 2]
In [88]:
fig = px.choropleth(merged_with_stats,
                    geojson=geojson_counties,
                    color='mean_county_depth',
                    scope='usa',
                    locations='FIPS',
                    hover_name = 'Jurisdiction',
                    color_continuous_scale = px.colors.sequential.PuBu, #[::-1], # reverses the color scale
                    hover_data = [ 'mean_county_depth', 'min_county_depth', 'max_county_depth','num_county_stations'],
                    title = 'Mean water table depths by VA County (> 2 stations)
                                  )
fig.update_geos(fitbounds='locations')
fig.show()